BHAVESHKUMAR THAKER
To build a network intrusion detection system to detect anomalies and attacks in the network. There are two problem formulations: binary classification (Normal vs. Attack) and multi-class classification (Normal, DoS, Probe, R2L, U2R).
| feature name | description | type |
|---|---|---|
| duration | length (number of seconds) of the connection | continuous |
| protocol_type | type of the protocol, e.g. tcp, udp, etc. | discrete |
| service | network service on the destination, e.g., http, telnet, etc. | discrete |
| src_bytes | number of data bytes from source to destination | continuous |
| dst_bytes | number of data bytes from destination to source | continuous |
| flag | normal or error status of the connection | discrete |
| land | 1 if connection is from/to the same host/port; 0 otherwise | discrete |
| wrong_fragment | number of "wrong" fragments | continuous |
| urgent | number of urgent packets | continuous |
| feature name | description | type |
|---|---|---|
| hot | number of "hot" indicators | continuous |
| num_failed_logins | number of failed login attempts | continuous |
| logged_in | 1 if successfully logged in; 0 otherwise | discrete |
| num_compromised | number of "compromised" conditions | continuous |
| root_shell | 1 if root shell is obtained; 0 otherwise | discrete |
| su_attempted | 1 if "su root" command attempted; 0 otherwise | discrete |
| num_root | number of "root" accesses | continuous |
| num_file_creations | number of file creation operations | continuous |
| num_shells | number of shell prompts | continuous |
| num_access_files | number of operations on access control files | continuous |
| num_outbound_cmds | number of outbound commands in an ftp session | continuous |
| is_host_login | 1 if the login belongs to the "hot" list; 0 otherwise | discrete |
| is_guest_login | 1 if the login is a "guest" login; 0 otherwise | discrete |
| feature name | description | type |
|---|---|---|
| count | number of connections to the same host as the current connection in the past two seconds | continuous |
| Note: The following features refer to these same-host connections. | ||
| serror_rate | % of connections that have "SYN" errors | continuous |
| rerror_rate | % of connections that have "REJ" errors | continuous |
| same_srv_rate | % of connections to the same service | continuous |
| diff_srv_rate | % of connections to different services | continuous |
| srv_count | number of connections to the same service as the current connection in the past two seconds | continuous |
| Note: The following features refer to these same-service connections. | ||
| srv_serror_rate | % of connections that have "SYN" errors | continuous |
| srv_rerror_rate | % of connections that have "REJ" errors | continuous |
| srv_diff_host_rate | % of connections to different hosts | continuous |
# Record the notebook start time (for total runtime reporting) and
# silence noisy library warnings.
import time
notebookstart = time.time()
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import platform
import sys
import importlib
import multiprocessing
import random
import numpy as np
import pandas as pd
# Seed the Python and NumPy RNGs for reproducibility; show all DataFrame
# columns when displaying tables.
random.seed(321)
np.random.seed(321)
pd.options.display.max_columns = 9999
# Custom categorical colour palette reused by plots throughout the notebook.
belize_light_flavor = [
    '#5899DA',
    '#E8743B',
    '#19A979',
    '#ED4A7B',
    '#945ECF',
    '#13A4B4',
    '#525DF4',
    '#BF399E',
    '#6C8893',
    '#EE6868',
    '#2F6497',
]
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Matplotlib / seaborn global configuration: inline retina output,
# 15x12 figures, light-cyan backgrounds, 'seaborn' style sheet.
%matplotlib inline
mpl.rc('figure', figsize=(15, 12))
plt.figure(figsize=(15, 12))
plt.rcParams['figure.facecolor'] = 'lightcyan'
mpl.style.use('seaborn')
plt.style.use('seaborn')
belize_light_flavor_cmap = mpl.colors.ListedColormap(belize_light_flavor)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
import seaborn as sns
sns.set(rc={'figure.figsize': (15, 12)})
sns.set(
    context='notebook',
    style='darkgrid',
    font='sans-serif',
    font_scale=1.1,
    rc={'figure.facecolor': 'lightcyan', 'axes.facecolor': 'lightcyan'
        , 'grid.color': 'steelblue'},
    )
sns.color_palette(belize_light_flavor);
# https://anaconda.org/anaconda/plotly
# conda install -c anaconda plotly
plotly_check = importlib.util.find_spec("plotly")
found = plotly_check is not None
if found:
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
else:
!conda install --yes --prefix {sys.prefix} plotly
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
from statsmodels.graphics.mosaicplot import mosaic
# https://anaconda.org/conda-forge/missingno
# conda install -c conda-forge missingno
missingno_check = importlib.util.find_spec("missingno")
found = missingno_check is not None
if found:
import missingno as msno
else:
!conda install --yes --prefix {sys.prefix} -c conda-forge missingno
import missingno as msno
# https://anaconda.org/conda-forge/scikit-plot
# conda install -c conda-forge scikit-plot
scikitplot_check = importlib.util.find_spec("scikitplot")
found = scikitplot_check is not None
if found:
import scikitplot as skplt
else:
#!conda install --yes --prefix {sys.prefix} -c conda-forge scikit-plot
!pip install scikit-plot
import scikitplot as skplt
import sklearn
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import average_precision_score, precision_recall_fscore_support
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
# https://anaconda.org/conda-forge/xgboost
# conda install -c conda-forge xgboost
xgboost_check = importlib.util.find_spec('xgboost')
found = xgboost_check is not None
if found:
import xgboost as xgb
from xgboost import XGBClassifier
else:
!conda install --yes --prefix {sys.prefix} -c conda-forge xgboost
import xgboost as xgb
from xgboost import XGBClassifier
# https://anaconda.org/conda-forge/catboost
# conda install -c conda-forge catboost
catboost_check = importlib.util.find_spec('catboost')
found = catboost_check is not None
if found:
import catboost
from catboost import CatBoostClassifier
else:
#!conda install --yes --prefix {sys.prefix} -c conda-forge catboost
!pip install catboost
import catboost
from catboost import CatBoostClassifier
# https://anaconda.org/conda-forge/lightgbm
# conda install -c conda-forge lightgbm
lightgbm_check = importlib.util.find_spec('lightgbm')
found = lightgbm_check is not None
if found:
import lightgbm as lgbm
else:
!conda install --yes --prefix {sys.prefix} -c conda-forge lightgbm
import lightgbm as lgbm
# https://anaconda.org/conda-forge/scikit-optimize
# conda install -c conda-forge scikit-optimize
skopt_check = importlib.util.find_spec('skopt')
found = skopt_check is not None
if found:
import skopt
from skopt import BayesSearchCV
else:
#!conda install --yes --prefix {sys.prefix} -c conda-forge scikit-optimize
!pip install scikit-optimize
import skopt
from skopt import BayesSearchCV
# https://anaconda.org/conda-forge/hyperopt
# conda install -c conda-forge hyperopt
hyperopt_check = importlib.util.find_spec('hyperopt')
found = hyperopt_check is not None
if found:
import hyperopt
from hyperopt import fmin, hp, tpe, rand, Trials, space_eval, STATUS_OK, STATUS_FAIL
else:
!conda install --yes --prefix {sys.prefix} -c conda-forge hyperopt
import hyperopt
from hyperopt import fmin, hp, tpe, rand, Trials, space_eval, STATUS_OK, STATUS_FAIL
import tensorflow as tf
tf.set_random_seed(321)
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, GaussianNoise
from keras.callbacks import EarlyStopping
from keras import regularizers
# Environment report: OS, Python and every library version used below,
# to make results reproducible.
print('Operating system version........', platform.platform())
print('Python version is............... %s.%s.%s' % sys.version_info[:3])
print('scikit-learn version is.........', sklearn.__version__)
print('pandas version is...............', pd.__version__)
print('numpy version is................', np.__version__)
print('matplotlib version is...........', mpl.__version__)
print('seaborn version is..............', sns.__version__)
print('plotly version is...............', plotly.__version__)
print('scikit-plot version is..........', skplt.__version__)
print('missingno version is............', msno.__version__)
print('xgboost version is..............', xgb.__version__)
print('catboost version is.............', catboost.__version__)
print('lightgbm version is.............', lgbm.__version__)
print('scikit-optimize version is......', skopt.__version__)
print('hyperopt version is.............', hyperopt.__version__)
print('tensorflow version is...........', tf.__version__)
print('keras version is................', keras.__version__)
def getDatasetInformation(csv_filepath, is_corr_required=True, columns=None):
    """
    Read a headerless CSV file into a DataFrame and summarise it.

    Parameters
    ----------
    csv_filepath : str
        Path to the CSV file (no header row expected).
    is_corr_required : bool, default True
        When True, also compute the Spearman correlation matrix between
        numerical columns (can be slow on wide data); otherwise an empty
        DataFrame is returned in its place.
    columns : list of str, optional
        Column names to assign. Defaults to the module-level
        ``columns_name`` list (backward-compatible behaviour).

    Returns
    -------
    list
        [DataFrame, shape table, dtypes table, describe table,
         unique-value counts, missing-value counts, correlation matrix]
    """
    dataset_tmp = pd.read_csv(csv_filepath, header=None, index_col=None)
    # Fall back to the notebook-global column list when none is supplied.
    dataset_tmp.columns = columns if columns is not None else columns_name
    dataset_tmp_shape = pd.DataFrame(list(dataset_tmp.shape),
            index=['No of Rows', 'No of Columns'], columns=['Total'])
    dataset_tmp_shape = dataset_tmp_shape.reset_index()
    dataset_tmp_dtypes = dataset_tmp.dtypes.reset_index()
    dataset_tmp_dtypes.columns = ['Column Names', 'Column Data Types']
    dataset_tmp_desc = dataset_tmp.describe().transpose()
    dataset_tmp_unique = dataset_tmp.nunique().reset_index()
    dataset_tmp_unique.columns = ['Column Name', 'Unique Value(s) Count']
    dataset_tmp_missing = dataset_tmp.isnull().sum(axis=0).reset_index()
    dataset_tmp_missing.columns = ['Column Names',
                                   'NULL value count per Column']
    # Columns with the most missing values first.
    dataset_tmp_missing = dataset_tmp_missing.sort_values(
        by='NULL value count per Column', ascending=False)
    if is_corr_required:
        dataset_tmp_corr = dataset_tmp.corr(method='spearman')
    else:
        dataset_tmp_corr = pd.DataFrame()
    return [
        dataset_tmp,
        dataset_tmp_shape,
        dataset_tmp_dtypes,
        dataset_tmp_desc,
        dataset_tmp_unique,
        dataset_tmp_missing,
        dataset_tmp_corr,
    ]
def getHighlyCorrelatedColumns(dataset, NoOfCols=6):
    """
    Return the NoOfCols column pairs with the strongest (absolute)
    Pearson correlation, one row per pair: [col_a, col_b, correlation].
    """
    corr_matrix = dataset.corr()
    # Zero out the diagonal and the lower triangle so every pair is
    # reported exactly once and self-correlations are excluded.
    strictly_upper = np.triu(np.ones(corr_matrix.values.shape), k=1)
    corr_matrix = corr_matrix * strictly_upper
    pair_series = corr_matrix.stack()
    ranked_index = pair_series.abs().sort_values(ascending=False).index
    ranked_pairs = pair_series.reindex(ranked_index).reset_index()
    return ranked_pairs.head(NoOfCols)
def createFeatureEngineeredColumns(dataset):
    """
    Append 15 row-wise statistical summary features to *dataset*
    (counts of zero/non-zero entries, weighted sum, sum, variance,
    median, mean, std, mode, skew, kurtosis, max, min, max-min range,
    and the median of the strictly-positive entries).

    Returns a new DataFrame: the original columns followed by the
    engineered columns.
    """
    dataset_tmp = pd.DataFrame()
    dataset_tmp['CountOfZeroValues'] = (dataset == 0).sum(axis=1)
    dataset_tmp['CountOfNonZeroValues'] = (dataset != 0).sum(axis=1)
    # Per-column weight = fraction of non-zero entries in that column.
    weight = ((dataset != 0).sum() / len(dataset)).values
    dataset_tmp['WeightedCount'] = (dataset * weight).sum(axis=1)
    dataset_tmp['SumOfValues'] = dataset.sum(axis=1)
    dataset_tmp['VarianceOfValues'] = dataset.var(axis=1)
    dataset_tmp['MedianOfValues'] = dataset.median(axis=1)
    dataset_tmp['MeanOfValues'] = dataset.mean(axis=1)
    dataset_tmp['StandardDeviationOfValues'] = dataset.std(axis=1)
    # BUG FIX: DataFrame.mode(axis=1) returns one column per tied mode;
    # assigning that multi-column frame to a single column raised
    # "Wrong number of items passed" whenever any row had ties.
    # Keep only the first (smallest) mode per row.
    dataset_tmp['ModeOfValues'] = dataset.mode(axis=1).iloc[:, 0]
    dataset_tmp['SkewOfValues'] = dataset.skew(axis=1)
    dataset_tmp['KurtosisOfValues'] = dataset.kurtosis(axis=1)
    dataset_tmp['MaxOfValues'] = dataset.max(axis=1)
    dataset_tmp['MinOfValues'] = dataset.min(axis=1)
    dataset_tmp['DiffOfMinMaxOfValues'] = (dataset_tmp['MaxOfValues']
                                           - dataset_tmp['MinOfValues'])
    # Median of the positive entries only (NaN when a row has none).
    dataset_tmp['QuantilePointFiveOfValues'] = \
        dataset[dataset > 0].quantile(0.5, axis=1)
    dataset = pd.concat([dataset, dataset_tmp], axis=1)
    return dataset
def getZeroStdColumns(dataset):
    """Return the names of constant columns (standard deviation == 0)."""
    std_by_column = dataset.std()
    return list(dataset.columns[std_by_column == 0])
def getUniqueValueColumns(dataset, valueToCheck=0):
    """Return the names of columns whose distinct-value count equals valueToCheck."""
    unique_counts = dataset.nunique()
    return list(dataset.columns[unique_counts == valueToCheck])
def getScaledDataset(dataset, scaleType='StandardScaler'):
    """
    Scale *dataset* with the requested scikit-learn scaler.

    Parameters
    ----------
    dataset : DataFrame
        Numeric data to scale.
    scaleType : str
        One of 'StandardScaler', 'MinMaxScaler', 'RobustScaler',
        'MaxAbsScaler', 'Normalizer'.

    Returns
    -------
    list
        [scaled DataFrame (integer column labels), fitted scaler].
        For an unrecognised scaleType, [original dataset, None].
    """
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'RobustScaler': RobustScaler,
        'MaxAbsScaler': MaxAbsScaler,
        'Normalizer': Normalizer,
    }
    scaler_class = scaler_classes.get(scaleType)
    # BUG FIX: the original referenced an undefined `scaler` (NameError)
    # and returned the undefined name `_` when scaleType was unknown.
    if scaler_class is None:
        return [dataset, None]
    scaler = scaler_class()
    dataset = pd.DataFrame(scaler.fit_transform(dataset))
    return [dataset, scaler]
def plot_countplot(x, title='', xtitle=''):
    """
    Seaborn count plot of series *x* with twin y axes: raw counts on the
    right axis, frequency in percent on the left axis, and a percentage
    label drawn above each bar.
    """
    ncount = len(x)
    ax = sns.countplot(x=x)
    plt.title(title, fontsize=18)
    plt.xlabel(xtitle, fontsize=14)
    legend_labels = x.unique()
    plt.legend(legend_labels, ncol=1, loc='best')
    # Make twin axis
    ax2 = ax.twinx()
    # Switch so count axis is on right, frequency on left
    ax2.yaxis.tick_left()
    ax.yaxis.tick_right()
    # Also switch the labels over
    ax2.yaxis.set_label_position('left')
    ax.yaxis.set_label_position('right')
    ax2.set_ylabel('Frequency [%]', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    # Annotate each bar with its share of the total sample count.
    # NOTE: `x` is re-bound inside the loop, shadowing the input series
    # (safe here: the series is not used again afterwards).
    for p in ax.patches:
        x = p.get_bbox().get_points()[:, 0]
        y = p.get_bbox().get_points()[1, 1]
        ax.annotate('{:.1f}%'.format(100. * y / ncount), (x.mean(), y),
                ha='center', va='bottom') # set the alignment of the text
    # Use a LinearLocator to ensure the correct number of ticks
    ax.yaxis.set_major_locator(ticker.LinearLocator(11))
    # Fix the frequency range to 0-100
    ax2.set_ylim(0, 100)
    ax.set_ylim(0, ncount)
    # And use a MultipleLocator to ensure a tick spacing of 10
    ax2.yaxis.set_major_locator(ticker.MultipleLocator(10))
    # Need to turn the grid on ax2 off, otherwise the gridlines end up on top of the bars
    ax2.grid(None)
def plot_valuecount_pieplot(x, title=''):
    """Pie chart of the value counts of series *x*, labelled with percentages."""
    counts = x.value_counts()
    plt.pie(counts, labels=counts.index,
            autopct='%1.1f%%', shadow=True, startangle=195)
    plt.title(title, fontsize=18)
    # Equal aspect ratio keeps the pie circular.
    plt.axis('equal')
    plt.show()
def plot_boxplot(x, y, title=''):
    """Seaborn box plot of *y* grouped by *x*, with x labels rotated 90 degrees."""
    sns.boxplot(x=x, y=y, palette=belize_light_flavor)
    plt.title(title, fontsize=18)
    plt.xticks(rotation=90)
    plt.show()
def plot_distplot(dataset):
    """
    Draw one seaborn distribution plot per numeric column of *dataset*,
    laid out in a grid three plots wide, each in a random Tableau colour.
    """
    import matplotlib.colors as mcolors
    colors = mcolors.TABLEAU_COLORS
    # FIX: np.int / np.float aliases were removed in NumPy >= 1.24;
    # np.number selects the same integer and float columns.
    dataset_fordist = dataset.select_dtypes([np.number])
    number_of_subplots = len(dataset_fordist.columns)
    number_of_columns = 3
    # FIX: the original added the full remainder (0-2 extra rows);
    # one extra row suffices whenever the division is not exact.
    number_of_rows = number_of_subplots // number_of_columns
    if number_of_subplots % number_of_columns:
        number_of_rows += 1
    postion = range(1, number_of_subplots + 1)
    fig = plt.figure(1)
    for k in range(number_of_subplots):
        ax = fig.add_subplot(number_of_rows, number_of_columns,
                             postion[k])
        sns.distplot(dataset_fordist.iloc[:, k],
                     color=random.choice(list(colors.keys())), ax=ax)
    fig.tight_layout()
    plt.show()
def getCategoricalVariableDistributionGraph(target_value, title=''):
    """
    Render a two-panel plotly figure of the value counts of *target_value*:
    a markers+lines scatter on the left and a bar chart on the right,
    sharing the y axis.
    """
    tmp_count = target_value.value_counts()
    # NOTE(review): plotly.tools.make_subplots was deprecated in favour of
    # plotly.subplots.make_subplots — assumes an older pinned plotly; confirm.
    figureCVDG = tools.make_subplots(rows=1, cols=2, shared_yaxes=True,
            subplot_titles=('Distribution Graph',
            'Distribution Graph - Bar'))
    figureCVDG.append_trace(go.Scatter(x=tmp_count.index, y=tmp_count,
            mode='markers+lines', connectgaps=True), 1,
            1)
    figureCVDG.append_trace(go.Bar(x=tmp_count.index, y=tmp_count), 1,
            2)
    figureCVDG['layout'].update(title=title,
            titlefont=dict(family='Arial',
            size=36), paper_bgcolor='#ffffcf',
            plot_bgcolor='#ffffcf')
    py.iplot(figureCVDG)
def getPlotlyLayout(title='', xtitle='', ytitle=''):
    """
    Build the notebook's standard plotly Layout: legend on, closest-point
    hover, pale-yellow backgrounds, Arial titles (36pt figure title,
    18pt axis titles, 14pt tick labels).
    """
    return go.Layout(
        title=title,
        showlegend=True,
        hovermode='closest',
        paper_bgcolor='#ffffcf',
        plot_bgcolor='#ffffcf',
        titlefont=dict(family='Arial', size=36),
        xaxis=dict(title=xtitle,
                   titlefont=dict(family='Arial', size=18),
                   tickfont=dict(family='Arial', size=14)),
        yaxis=dict(title=ytitle,
                   titlefont=dict(family='Arial', size=18),
                   tickfont=dict(family='Arial', size=14)),
    )
class PseudoLabeler(BaseEstimator, ClassifierMixin):
    '''
    Scikit-learn wrapper for creating pseudo-labeled estimators.

    Wraps any classifier and, on fit(), augments the labeled training set
    with a random sample of the unlabeled data whose labels are predicted
    by the wrapped model itself (pseudo-labeling), then refits the model
    on the combined, shuffled set.
    '''
    def __init__(self, model, unlabled_data, features, target, sample_rate=0.2, seed=42):
        '''
        @model - the wrapped classifier (must expose fit/predict/predict_proba)
        @unlabled_data - DataFrame of samples without ground-truth labels
        @features - list of feature column names fed to the model
        @target - name of the label column created on the pseudo data
        @sample_rate - percent of samples used as pseudo-labelled data
                       from the unlabled dataset
        @seed - stored and also assigned to the wrapped model's `seed` attribute
        '''
        # NOTE(review): only the upper bound is asserted; sample_rate < 0.0
        # is not rejected here.
        assert sample_rate <= 1.0, 'Sample_rate should be between 0.0 and 1.0.'
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        self.model.seed = seed
        self.unlabled_data = unlabled_data
        self.features = features
        self.target = target
    def get_params(self, deep=True):
        # Required by the sklearn estimator API (clone()/GridSearchCV).
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "unlabled_data": self.unlabled_data,
            "features": self.features,
            "target": self.target
        }
    def set_params(self, **parameters):
        # Required by the sklearn estimator API; sets attributes verbatim.
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    def fit(self, X, y):
        '''
        Fit the data using pseudo labeling.
        '''
        augemented_train = self.__create_augmented_train(X, y)
        self.model.fit(
            augemented_train[self.features],
            augemented_train[self.target]
        )
        return self
    def __create_augmented_train(self, X, y):
        '''
        Create and return the augmented_train set that consists
        of pseudo-labeled and labeled data.
        '''
        num_of_samples = int(len(self.unlabled_data) * self.sample_rate)
        # Train the model and creat the pseudo-labels
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.unlabled_data[self.features])
        # Add the pseudo-labels to the test set
        pseudo_data = self.unlabled_data.copy(deep=True)
        pseudo_data[self.target] = pseudo_labels
        # Take a subset of the test set with pseudo-labels and append in onto
        # the training set
        # NOTE(review): sample() is not seeded here, so the drawn subset
        # varies between runs — confirm whether that is intended.
        sampled_pseudo_data = pseudo_data.sample(n=num_of_samples)
        temp_train = pd.concat([X, y], axis=1)
        augemented_train = pd.concat([sampled_pseudo_data, temp_train])
        return shuffle(augemented_train)
    def predict(self, X):
        '''
        Returns the predicted values.
        '''
        return self.model.predict(X)
    def predict_proba(self, X):
        '''
        Returns the proba.
        '''
        return self.model.predict_proba(X)
    def get_model_name(self):
        # Convenience: human-readable class name of the wrapped model.
        return self.model.__class__.__name__
def convertIntFloatToInt(dictObj):
    """
    In place, convert every whole-number float value in *dictObj* to int
    (e.g. 2.0 -> 2); all other values are left untouched.

    Returns the same (mutated) dict, so the call can be chained.

    BUG FIX: the original evaluated int(v) before checking the type, so
    -inf raised OverflowError, NaN raised ValueError and non-numeric
    strings raised ValueError. float.is_integer() is False for all of
    these, so they are now simply skipped (+inf was already skipped).
    """
    for key, value in dictObj.items():
        if isinstance(value, float) and value.is_integer():
            dictObj[key] = int(value)
    return dictObj
def attackTypeNumConverter(attack_type):
    """Numeric binary target: 0 for 'normal' traffic, 1 for any attack."""
    return 0 if attack_type == 'normal' else 1
def attackTypeConverter(attack_type):
    """String binary target: 'Normal' for 'normal' traffic, otherwise 'Attack'."""
    return 'Normal' if attack_type == 'normal' else 'Attack'
# Attack-name groupings for the five-class target; consumed by
# attackTypeMultiConverter / attackTypeMultiNumConverter.
# Denial-of-Service attacks.
dos_list = [
    'back',
    'land',
    'neptune',
    'pod',
    'smurf',
    'teardrop',
    'apache2',
    'udpstorm',
    'processtable',
    'worm',
]
# Probing / surveillance attacks.
probe_list = [
    'satan',
    'ipsweep',
    'nmap',
    'portsweep',
    'mscan',
    'saint',
]
# Remote-to-Local attacks.
r2l_list = [
    'guess_password',
    'ftp_write',
    'imap',
    'phf',
    'multihop',
    'warezmaster',
    'warezclient',
    'spy',
    'xlock',
    'xsnoop',
    'snmpguess',
    'snmpgetattack',
    'httptunnel',
    'sendmail',
    'named',
]
# User-to-Root privilege-escalation attacks.
u2r_list = [
    'buffer_overflow',
    'loadmodule',
    'rootkit',
    'perl',
    'sqlattack',
    'xterm',
    'ps',
]
def attackTypeMultiNumConverter(attack_type_value):
    """
    Numeric five-class target: 1=DoS, 2=Probe, 3=R2L, 4=U2R,
    0 for anything else (normal traffic).
    """
    for attack_names, category_code in ((dos_list, 1), (probe_list, 2),
                                        (r2l_list, 3), (u2r_list, 4)):
        if attack_type_value in attack_names:
            return category_code
    return 0
def attackTypeMultiConverter(attack_type_value):
    """
    String five-class target: 'DoS', 'Probe', 'R2L' or 'U2R' depending on
    which attack list contains the value, else 'Normal'.
    """
    for attack_names, category_label in ((dos_list, 'DoS'),
                                         (probe_list, 'Probe'),
                                         (r2l_list, 'R2L'),
                                         (u2r_list, 'U2R')):
        if attack_type_value in attack_names:
            return category_label
    return 'Normal'
root_dir = ''
try:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = '/content/gdrive/My Drive/Colab Notebooks/Telecom Network Anomaly Detection/'
!ls '/content/gdrive/My Drive/Colab Notebooks/Telecom Network Anomaly Detection'
except:
print('No GOOGLE DRIVE connection. Using local dataset(s).')
# Column names for the headerless train/test CSV files, in exact on-disk
# order: 41 features, the attack label, and a trailing flag column.
columns_name = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_count',
    'dst_host_diff_srv_count',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack_type',
    'last_flag',
]
# Load the train/test datasets (headerless CSVs, columns from columns_name)
# together with their summary tables; correlation is skipped here (False).
(
    dataset_network_train,
    df_train_shape,
    df_train_dtypes,
    df_train_describe,
    df_train_unique,
    df_train_missing,
    df_train_corr,
) = getDatasetInformation(root_dir + 'Train.txt', False)
(
    dataset_network_test,
    df_test_shape,
    df_test_dtypes,
    df_test_describe,
    df_test_unique,
    df_test_missing,
    df_test_corr,
) = getDatasetInformation(root_dir + 'Test.txt', False)
# --- Quick EDA: shapes, dtypes, describe(), unique and missing counts ---
df_train_shape
df_test_shape
dataset_network_train_rows = dataset_network_train.shape[0]
dataset_network_test_rows = dataset_network_test.shape[0]
dataset_network_train.head()
dataset_network_test.head()
df_train_dtypes
df_test_dtypes
df_train_describe
df_test_describe
df_train_unique
df_test_unique
df_train_missing
# Missing-value matrix via missingno.
msno.matrix(dataset_network_train, color=(33 / 255, 102 / 255, 172 / 255));
df_test_missing
msno.matrix(dataset_network_test, color=(33 / 255, 102 / 255, 172 / 255));
# Distribution plots of all numeric columns.
plot_distplot(dataset_network_train)
plot_distplot(dataset_network_test)
# --- Label and categorical feature distributions ---
dataset_network_train.attack_type.unique()
dataset_network_train.attack_type.value_counts()
plot_valuecount_pieplot(dataset_network_train.attack_type, 'Attack Type (train feature) Distribution Percentage')
dataset_network_test.attack_type.unique()
dataset_network_test.attack_type.value_counts()
plot_valuecount_pieplot(dataset_network_test.attack_type, 'Attack Type (test feature) Distribution Percentage')
dataset_network_train.last_flag.unique()
dataset_network_train.last_flag.value_counts()
plot_valuecount_pieplot(dataset_network_train.last_flag, 'Last Flag (train feature) Distribution Percentage')
dataset_network_test.last_flag.unique()
dataset_network_test.last_flag.value_counts()
plot_valuecount_pieplot(dataset_network_test.last_flag, 'Last Flag (test feature) Distribution Percentage')
# Side-by-side train/test pie charts for the string-valued categoricals.
plt.subplot(121)
plot_valuecount_pieplot(dataset_network_train.protocol_type, 'Protocol Type (train feature) Distribution Percentage')
plt.subplot(122)
plot_valuecount_pieplot(dataset_network_test.protocol_type, 'Protocol Type (test feature) Distribution Percentage')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_valuecount_pieplot(dataset_network_train.service, 'Service (train feature) Distribution Percentage')
plt.subplot(122)
plot_valuecount_pieplot(dataset_network_test.service, 'Service (test feature) Distribution Percentage')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_valuecount_pieplot(dataset_network_train.flag, 'Flag (train feature) Distribution Percentage')
plt.subplot(122)
plot_valuecount_pieplot(dataset_network_test.flag, 'Flag (test feature) Distribution Percentage')
plt.tight_layout()
plt.show()
# Side-by-side train/test count plots for the binary indicator features.
plt.subplot(121)
plot_countplot(dataset_network_train.land, 'Land (train feature) Distribution', 'Land values')
plt.subplot(122)
plot_countplot(dataset_network_test.land, 'Land (test feature) Distribution', 'Land values')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_countplot(dataset_network_train.logged_in, 'Logged In (train feature) Distribution', 'Logged In values')
plt.subplot(122)
plot_countplot(dataset_network_test.logged_in, 'Logged In (test feature) Distribution', 'Logged In values')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_countplot(dataset_network_train.root_shell, 'Root Shell (train feature) Distribution', 'Root Shell values')
plt.subplot(122)
plot_countplot(dataset_network_test.root_shell, 'Root Shell (test feature) Distribution', 'Root Shell values')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_countplot(dataset_network_train.su_attempted, 'su Attempted (train feature) Distribution', 'su Attempted values')
plt.subplot(122)
plot_countplot(dataset_network_test.su_attempted, 'su Attempted (test feature) Distribution', 'su Attempted values')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_countplot(dataset_network_train.is_host_login, 'Is Host Login (train feature) Distribution', 'Is Host Login values')
plt.subplot(122)
plot_countplot(dataset_network_test.is_host_login, 'Is Host Login (test feature) Distribution', 'Is Host Login values')
plt.tight_layout()
plt.show()
plt.subplot(121)
plot_countplot(dataset_network_train.is_guest_login, 'Is Guest Login (train feature) Distribution', 'Is Guest Login values')
plt.subplot(122)
plot_countplot(dataset_network_test.is_guest_login, 'Is Guest Login (test feature) Distribution', 'Is Guest Login values')
plt.tight_layout()
plt.show()
# Box plots of selected features against the raw attack_type label.
plt.subplot(211)
plot_boxplot(dataset_network_train.attack_type,
             dataset_network_train.dst_host_count,
             title='attack_type v/s dst_host_count (train data)'
             );
plt.subplot(211)
plot_boxplot(dataset_network_test.attack_type,
             dataset_network_test.dst_host_count,
             title='attack_type v/s dst_host_count (test data)'
             );
plt.show()
plt.subplot(211)
plot_boxplot(dataset_network_train.attack_type,
             dataset_network_train.dst_host_srv_count,
             title='attack_type v/s dst_host_srv_count (train data)'
             );
plt.subplot(211)
plot_boxplot(dataset_network_test.attack_type,
             dataset_network_test.dst_host_srv_count,
             title='attack_type v/s dst_host_srv_count (test data)'
             );
plt.show()
plt.subplot(211)
plot_boxplot(dataset_network_train.attack_type,
             dataset_network_train.is_guest_login,
             title='attack_type v/s is_guest_login (train data)'
             );
plt.subplot(211)
plot_boxplot(dataset_network_test.attack_type,
             dataset_network_test.is_guest_login,
             title='attack_type v/s is_guest_login (test data)'
             );
plt.show()
# Spearman correlation between the numerical training columns.
df_corr = dataset_network_train.corr(method='spearman')
df_corr
# Generate a mask for the upper triangle
#mask = np.zeros_like(df_corr, dtype=bool)
#mask[np.triu_indices_from(mask)] = True
sns.heatmap(df_corr, cmap='rainbow', annot=False, fmt=".2f", center=0, square=False, linewidths=.75,
            #mask=mask,
            );
plt.title('Correlation Matrix', fontsize=18)
plt.show()
getHighlyCorrelatedColumns(dataset_network_train, 10)
# FIX: np.bool was removed in NumPy >= 1.24; the builtin bool is the
# documented replacement and behaves identically here.
upper = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))
# Drop one column of every pair correlated above 0.9 to reduce redundancy
# (same columns dropped from both train and test).
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
print(f'Columns to drop from Train and Test datasets are {to_drop}.')
dataset_network_train.drop(columns=to_drop, axis=1, inplace=True)
dataset_network_test.drop(columns=to_drop, axis=1, inplace=True)
(dataset_network_train.shape, dataset_network_test.shape)
# Binary target: string ('Normal'/'Attack') and numeric (0/1) versions.
dataset_network_train['attack_type_twoclass'] = dataset_network_train['attack_type'].apply(attackTypeConverter)
dataset_network_test['attack_type_twoclass'] = dataset_network_test['attack_type'].apply(attackTypeConverter)
dataset_network_train['attack_type_twoclass_num'] = dataset_network_train['attack_type'].apply(attackTypeNumConverter)
dataset_network_test['attack_type_twoclass_num'] = dataset_network_test['attack_type'].apply(attackTypeNumConverter)
plt.subplot(221)
plot_countplot(dataset_network_train.attack_type_twoclass, 'Attack Type - Two Class (train feature) Distribution', 'Attack Type(s)')
plt.subplot(222)
plot_valuecount_pieplot(dataset_network_train.attack_type_twoclass, 'Attack Type - Two Class (train feature) Distribution Percentage')
plt.subplot(223)
plot_countplot(dataset_network_test.attack_type_twoclass, 'Attack Type - Two Class (test feature) Distribution', 'Attack Type(s)')
plt.subplot(224)
plot_valuecount_pieplot(dataset_network_test.attack_type_twoclass, 'Attack Type - Two Class (test feature) Distribution Percentage')
plt.tight_layout()
plt.show()
# Five-class target (Normal/DoS/Probe/R2L/U2R), string and numeric versions.
dataset_network_train['attack_type_fiveclass'] = dataset_network_train['attack_type'].apply(attackTypeMultiConverter)
dataset_network_test['attack_type_fiveclass'] = dataset_network_test['attack_type'].apply(attackTypeMultiConverter)
dataset_network_train['attack_type_fiveclass_num'] = dataset_network_train['attack_type'].apply(attackTypeMultiNumConverter)
dataset_network_test['attack_type_fiveclass_num'] = dataset_network_test['attack_type'].apply(attackTypeMultiNumConverter)
plt.subplot(221)
plot_countplot(dataset_network_train.attack_type_fiveclass, 'Attack Type - Five Class (train feature) Distribution', 'Attack Type(s)')
plt.subplot(222)
plot_valuecount_pieplot(dataset_network_train.attack_type_fiveclass, 'Attack Type - Five Class (train feature) Distribution Percentage')
plt.subplot(223)
plot_countplot(dataset_network_test.attack_type_fiveclass, 'Attack Type - Five Class (test feature) Distribution', 'Attack Type(s)')
plt.subplot(224)
plot_valuecount_pieplot(dataset_network_test.attack_type_fiveclass, 'Attack Type - Five Class (test feature) Distribution Percentage')
plt.tight_layout()
plt.show()
(dataset_network_train.shape, dataset_network_test.shape)
# Pairwise feature relationships coloured by the five-class label.
# NOTE(review): seaborn renamed lmplot's `size` parameter to `height` in
# 0.9 — `size=11` assumes an older pinned seaborn; confirm the version.
sns.lmplot(x='dst_host_same_src_port_rate', y='dst_host_srv_diff_host_rate', hue='attack_type_fiveclass',
           data=dataset_network_train, size=11
           );
plt.title('"dst_host_same_src_port_rate" vs "dst_host_srv_diff_host_rate"', fontsize=18)
plt.show()
sns.lmplot(x='duration', y='src_bytes', hue='attack_type_fiveclass', data=dataset_network_train, size=11);
plt.title('"duration" vs "src_bytes"', fontsize=18)
plt.show()
sns.lmplot(x='dst_host_count', y='serror_rate', hue='attack_type_fiveclass', data=dataset_network_train, size=11);
plt.title('"dst_host_count" vs "serror_rate"', fontsize=18)
plt.show()
sns.lmplot(x='count', y='serror_rate', hue='attack_type_fiveclass', data=dataset_network_train, size=11);
plt.title('"count" vs "serror_rate"', fontsize=18)
plt.show()
sns.pointplot(x='flag', y='land', hue='attack_type_fiveclass', data=dataset_network_train, join=False);
plt.title('"flag" vs "land"', fontsize=18)
plt.show()
# Mosaic plots for pairs of categorical features.
mosaic(dataset_network_train, ['service', 'protocol_type']);
plt.title('"service" vs "protocol_type"', fontsize=18)
plt.show()
mosaic(dataset_network_train, ['service', 'flag']);
plt.title('"service" vs "flag"', fontsize=18)
plt.show()
# Combined train+test category inventories (inputs to the encoders below).
unique_protocol_type = pd.concat([dataset_network_train.protocol_type,
                                  dataset_network_test.protocol_type],
                                 ignore_index=True).unique().ravel()
unique_service = pd.concat([dataset_network_train.service,
                            dataset_network_test.service],
                           ignore_index=True).unique().ravel()
unique_flag = pd.concat([dataset_network_train.flag,
                         dataset_network_test.flag],
                        ignore_index=True).unique().ravel()
print(unique_protocol_type)
print(unique_service)
print(unique_flag)
def protocolTypeNumConverter(protocol_type):
    """Map a protocol name to its ordinal code (tcp=0, udp=1, icmp=2; None if unknown)."""
    return {'tcp': 0, 'udp': 1, 'icmp': 2}.get(protocol_type)
def flagNumConverter(flag):
    """Encode a TCP connection-status flag string as an integer code.

    Unknown flags yield None, matching the fall-through behaviour of the
    original if/elif chain.
    """
    codes = {
        'SF': 0, 'S0': 1, 'REJ': 2, 'RSTR': 3, 'SH': 4, 'RSTO': 5,
        'S1': 6, 'RSTOS0': 7, 'S3': 8, 'S2': 9, 'OTH': 10,
    }
    return codes.get(flag)
# Service name -> integer code. List order defines the code (0..69), which
# reproduces the original hand-written mapping exactly.
_service_names = [
    'ftp_data', 'other', 'private', 'http', 'remote_job', 'name',
    'netbios_ns', 'eco_i', 'mtp', 'telnet', 'finger', 'domain_u',
    'supdup', 'uucp_path', 'Z39_50', 'smtp', 'csnet_ns', 'uucp',
    'netbios_dgm', 'urp_i', 'auth', 'domain', 'ftp', 'bgp', 'ldap',
    'ecr_i', 'gopher', 'vmnet', 'systat', 'http_443', 'efs', 'whois',
    'imap4', 'iso_tsap', 'echo', 'klogin', 'link', 'sunrpc', 'login',
    'kshell', 'sql_net', 'time', 'hostnames', 'exec', 'ntp_u',
    'discard', 'nntp', 'courier', 'ctf', 'ssh', 'daytime', 'shell',
    'netstat', 'pop_3', 'nnsp', 'IRC', 'pop_2', 'printer', 'tim_i',
    'pm_dump', 'red_i', 'netbios_ssn', 'rje', 'X11', 'urh_i',
    'http_8001', 'aol', 'http_2784', 'tftp_u', 'harvest',
]
serviceNumConverter = {name: code for code, name in enumerate(_service_names)}
# Apply the categorical -> integer encodings to both splits, in place.
dataset_network_train['protocol_type'] = dataset_network_train['protocol_type'].apply(protocolTypeNumConverter)
dataset_network_test['protocol_type'] = dataset_network_test['protocol_type'].apply(protocolTypeNumConverter)
dataset_network_train.flag = dataset_network_train.flag.apply(flagNumConverter)
dataset_network_test.flag = dataset_network_test.flag.apply(flagNumConverter)
# `service` uses a plain dict lookup; an unseen service name raises KeyError
# here (the .apply converters above would silently produce None instead).
dataset_network_train.service = [serviceNumConverter[item] for item in dataset_network_train.service]
dataset_network_test.service = [serviceNumConverter[item] for item in dataset_network_test.service]
# Using map() function
# dataset_network_train['service'] = dataset_network_train['service'].map(serviceNumConverter).astype(int)
# dataset_network_test['service'] = dataset_network_test['service'].map(serviceNumConverter).astype(int)
# Split into feature matrix X and the two targets:
#   y = two-class label (normal vs attack), z = five-class attack category.
X_train = dataset_network_train.drop(['attack_type',
                                      'attack_type_twoclass', 'attack_type_twoclass_num',
                                      'attack_type_fiveclass', 'attack_type_fiveclass_num'
                                      ], axis=1
                                     )
y_train = dataset_network_train['attack_type_twoclass_num']
z_train = dataset_network_train['attack_type_fiveclass_num']
X_test = dataset_network_test.drop(['attack_type',
                                    'attack_type_twoclass', 'attack_type_twoclass_num',
                                    'attack_type_fiveclass', 'attack_type_fiveclass_num'
                                    ], axis=1
                                   )
y_test = dataset_network_test['attack_type_twoclass_num']
z_test = dataset_network_test['attack_type_fiveclass_num']
# Drop constant (zero standard deviation) columns - they carry no signal.
# getZeroStdColumns is a helper defined earlier in the notebook.
columnsWithZeroStdToRemove = getZeroStdColumns(X_train)
print(f'Columns with Zero STD to drop from Train and Test dataset(s) are {columnsWithZeroStdToRemove}.')
X_train.drop(columnsWithZeroStdToRemove, axis=1, inplace=True)
X_test.drop(columnsWithZeroStdToRemove, axis=1, inplace=True)
# Notebook-style expression cells: display the resulting shapes.
(X_train.shape, y_train.shape, z_train.shape)
(X_test.shape, y_test.shape, z_test.shape)
# ---- KMeans elbow method: within-cluster sum of squares (inertia) for k = 1..10 ----
onetoten = [i for i in range(1, 11)]
wcss = []
for i in onetoten:
    kmeans = KMeans(n_clusters=i, init='k-means++')
    # FIX: fit_predict()'s cluster labels were discarded; fit() alone is
    # sufficient since only the fitted model's inertia_ is used.
    kmeans.fit(X_train)
    wcss.append(kmeans.inertia_)
# The Elbow Method Plot using Plotly (kept disabled, as in the original run)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# data = [go.Scatter(x = onetoten, y = wcss, mode='lines+markers',
#                    marker = dict(symbol = 'circle',),
#                    )
#        ]
# layout = getPlotlyLayout('KMeans - The Elbow Method','Number of clusters','WCSS')
# figure = dict(data = data, layout = layout)
# py.iplot(figure)
# The Elbow Method Plot using matplotlib
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.plot(onetoten, wcss, marker='o')  # consistency: same k values as the loop above
plt.title('KMeans - The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Feature scaling was evaluated and left disabled in the original notebook:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.fit_transform(X_test)
# X_train = pd.DataFrame(X_train_scaled)
# X_test = pd.DataFrame(X_test_scaled)
# Worker count used by every parallel fit below.
# (Name spelling `avaliable` kept - it is referenced throughout the file.)
n_cpus_avaliable = multiprocessing.cpu_count()
print(f'We\'ve got {n_cpus_avaliable} cpus to work with.')
# ---- Keras feed-forward NN: binary (normal vs attack) classifier ----
# earlyStopping = EarlyStopping(monitor='val_binary_accuracy', patience=1, verbose=0, mode='max')
kerasnn_model = Sequential()
# 32 -> 64 -> 128 -> 128 -> 64 -> 32 funnel, each dense layer L2-regularised,
# interleaved with Dropout/BatchNorm/GaussianNoise for regularisation.
kerasnn_model.add(Dense(32, input_shape=(X_train.shape[1], ), activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(Dropout(0.25))
kerasnn_model.add(BatchNormalization())
kerasnn_model.add(Dense(64, activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(GaussianNoise(0.1))
kerasnn_model.add(Dense(128, activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(Dropout(0.25))
kerasnn_model.add(BatchNormalization())
kerasnn_model.add(Dense(128, activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(GaussianNoise(0.1))
kerasnn_model.add(Dense(64, activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(Dropout(0.25))
kerasnn_model.add(BatchNormalization())
kerasnn_model.add(Dense(32, activation='relu',
                        kernel_regularizer=regularizers.l2(0.01),
                        ))
kerasnn_model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit -> P(attack)
kerasnn_model.summary()
kerasnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['binary_accuracy'])
#kerasnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=9, callbacks=[earlyStopping], shuffle=True, verbose=1)
kerasnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, shuffle=True, verbose=0)
kerasnn_model.evaluate(X_test, y_test)
# Predicting the Test set results
# FIX: Sequential.predict_classes()/predict_proba() were removed in
# TensorFlow 2.6 - derive both from predict() instead (sigmoid output,
# 0.5 decision threshold).
y_pred_proba = kerasnn_model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype('int32')
ac = accuracy_score(y_test, y_pred)
print('The accuracy score of the Keras NN (Two Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(y_test, y_pred)
print(cr)
print('\n')
skplt.metrics.plot_confusion_matrix(y_test, y_pred, title='Binomial Classification (Keras NN Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    )
def lgbm_status_print_twoclass(optimal_result):
    """BayesSearchCV callback: log progress after each LightGBM two-class fit.

    `optimal_result` is supplied by skopt but unused; the running best is read
    off the tuner object itself.
    """
    # One cv_results_ row per candidate evaluated so far -> model counter.
    all_models = pd.DataFrame(lgbm_bayes_cv_tuner_twoclass.cv_results_)
    # FIX: removed the unused `best_params` Series the original built and dropped.
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(lgbm_bayes_cv_tuner_twoclass.best_score_, 4),
        lgbm_bayes_cv_tuner_twoclass.best_params_))
# Bayesian hyper-parameter search (scikit-optimize BayesSearchCV) over a
# LightGBM binary classifier, scored by ROC-AUC with 9-fold stratified CV.
lgbm_bayes_cv_tuner_twoclass = BayesSearchCV(
    estimator=lgbm.LGBMClassifier(n_jobs=n_cpus_avaliable,
                                  objective='binary',
                                  metric='binary_logloss',
                                  class_weight='balanced',  # re-weight classes by inverse frequency
                                  silent=True),
    search_spaces={
        'boosting_type': ['gbdt', 'dart', 'rf'],
        # NOTE(review): LightGBM requires num_leaves > 1; the lower bound of 1
        # can draw an invalid candidate - confirm against the installed version.
        'num_leaves': (1, 50),
        'max_depth': (1, 25),
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'n_estimators': (100, 300),
        'min_split_gain': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0.01, 1.0, 'uniform'),
        'min_child_samples': (1, 50),
        'subsample': (0.01, 1.0, 'uniform'),
        'subsample_freq': (1, 50),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'bagging_fraction': (0.01, 1.0, 'uniform'),
        'feature_fraction': (0.01, 1.0, 'uniform'),
    },
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=9, shuffle=True),
    n_jobs=n_cpus_avaliable,
    n_iter=5,        # number of Bayesian optimisation candidates
    refit=True,      # refit the best estimator on the full training set
    verbose=0,
)
# Run the search; the callback prints progress after every candidate.
lgbm_result_twoclass = lgbm_bayes_cv_tuner_twoclass.fit(X_train, y_train,
                                                        callback=lgbm_status_print_twoclass)
lgbm_twoclass_model = lgbm_result_twoclass.best_estimator_
print(lgbm_twoclass_model)
# Predicting the Test set results
y_pred = lgbm_twoclass_model.predict(X_test)
y_pred_proba = lgbm_twoclass_model.predict_proba(X_test)
ac = accuracy_score(y_test, y_pred)
print('The accuracy score of the LightGBM (Two Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(y_test, y_pred)
print(cr)
print('\n')
# Confusion matrix, precision-recall and ROC plots via scikit-plot.
skplt.metrics.plot_confusion_matrix(y_test, y_pred, title='Binomial Classification (LightGBM Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    );
print('\n')
skplt.metrics.plot_precision_recall(y_test, y_pred_proba,
                                    title='Binomial Classification (LightGBM Precision-Recall Curve)',
                                    );
print('\n')
skplt.metrics.plot_roc(y_test, y_pred_proba,
                       title='Binomial Classification (LightGBM ROC Curves)',
                       );
# Top-30 features by LightGBM importance, plotted smallest-to-largest.
feature_importance = pd.DataFrame({'imp': lgbm_twoclass_model.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Binomial Classification (LightGBM - Feature Importance(s))', fontsize=18)
plt.show()
def lgbmc_objective(params):
    """Hyperopt objective: 9-fold stratified CV score of a LightGBM binary classifier.

    Returns the NEGATIVE mean CV score, because hyperopt's fmin MINIMISES the
    objective. The original returned the raw (positive) score, which made
    fmin search for the WORST hyper-parameters.
    """
    # Cast the quniform-sampled floats back to the ints LightGBM expects.
    # FIX: 'subsample_freq' was filled from params['min_child_samples']
    # (copy/paste slip) - it now uses its own sampled value.
    # NOTE(review): this rebuild intentionally keeps only the integer
    # hyper-parameters, discarding the sampled float ones (learning_rate,
    # subsample, ...) exactly as the original did - confirm whether the float
    # parameters should be forwarded as well.
    params = {
        'num_leaves': int(params['num_leaves']),
        'max_depth': int(params['max_depth']),
        'n_estimators': int(params['n_estimators']),
        'min_child_samples': int(params['min_child_samples']),
        'subsample_freq': int(params['subsample_freq']),
    }
    lgbmClassifier = lgbm.LGBMClassifier(
        n_jobs=n_cpus_avaliable,
        objective='binary',
        metric='binary_logloss',
        class_weight='balanced',
        silent=True,
        **params
    )
    # if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.
    lgbmcScore = cross_val_score(lgbmClassifier, X_train, y_train,
                                 cv=StratifiedKFold(n_splits=9,
                                                    shuffle=True),
                                 n_jobs=n_cpus_avaliable).mean()
    # print('LGBM-Classifier Score: {:.4f}, Parameters are: {}'.format(lgbmcScore, params))
    return -lgbmcScore  # negate: fmin minimises, we want to maximise the score
# Hyperopt search space for the LightGBM classifier.
# FIX: hp.qloguniform(label, low, high, q) expects LOW/HIGH in *log* space -
# the original passed raw values, so 'learning_rate' actually sampled
# exp(0.01)..exp(1.0) ~= 1.01..2.72, and 'reg_lambda' exp(1e-5)..exp(1000),
# which overflows. The intended bounds are now wrapped in np.log().
lgbmc_space = {
    'num_leaves': hp.quniform('num_leaves', 1, 50, 1),
    'max_depth': hp.quniform('max_depth', 1, 25, 1),
    'learning_rate': hp.qloguniform('learning_rate', np.log(0.01), np.log(1.0), 0.01),
    'n_estimators': hp.quniform('n_estimators', 100, 300, 2),
    'min_split_gain': hp.quniform('min_split_gain', 0.01, 1.0, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 0.01, 1.0,
                                    0.01),
    'min_child_samples': hp.quniform('min_child_samples', 1, 50, 1),
    'subsample': hp.quniform('subsample', 0.01, 1.0, 0.01),
    'subsample_freq': hp.quniform('subsample_freq', 1, 50, 1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.01, 1.0,
                                    0.01),
    'reg_lambda': hp.qloguniform('reg_lambda', np.log(1e-5), np.log(1000), 0.01),
    'bagging_fraction': hp.quniform('bagging_fraction', 0.01, 1.0,
                                    0.01),
    'feature_fraction': hp.quniform('feature_fraction', 0.01, 1.0,
                                    0.01),
}
# Run the TPE search (5 evaluations), then rebuild and evaluate the best model.
trials = Trials()
lgbmc_best = fmin(fn=lgbmc_objective, space=lgbmc_space,
                  algo=tpe.suggest, max_evals=5, trials=trials)
print(trials.losses())
print('\n')
print(lgbmc_best)
# Map the raw fmin result back onto the space definition (display only).
space_eval(lgbmc_space, lgbmc_best)
# convertIntFloatToInt is a notebook helper defined earlier: casts the
# integer-valued floats produced by quniform back to int.
lgbmc_best = convertIntFloatToInt(lgbmc_best)
lgbmc_best
# Train a LightGBM classifier with the hyperopt-selected parameters.
lgbmClassifierHyperopt = lgbm.LGBMClassifier(
    n_jobs=n_cpus_avaliable,
    objective='binary',
    metric='binary_logloss',
    class_weight='balanced',
    silent=True,
    **lgbmc_best
)
lgbmClassifierHyperopt.fit(X_train, y_train)
# Predicting the Test set results
y_pred = lgbmClassifierHyperopt.predict(X_test)
y_pred_proba = lgbmClassifierHyperopt.predict_proba(X_test)
ac = accuracy_score(y_test, y_pred)
print('The accuracy score of the LightGBM Hyperopt (Two Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(y_test, y_pred)
print(cr)
print('\n')
# Confusion matrix, precision-recall and ROC plots via scikit-plot.
skplt.metrics.plot_confusion_matrix(y_test, y_pred, title='Binomial Classification (LightGBM Hyperopt Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    );
print('\n')
skplt.metrics.plot_precision_recall(y_test, y_pred_proba,
                                    title='Binomial Classification (LightGBM Hyperopt Precision-Recall Curve)',
                                    );
print('\n')
skplt.metrics.plot_roc(y_test, y_pred_proba,
                       title='Binomial Classification (LightGBM Hyperopt ROC Curves)',
                       );
# Top-30 features by importance.
feature_importance = pd.DataFrame({'imp': lgbmClassifierHyperopt.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Binomial Classification (LightGBM - Feature Importance(s))', fontsize=18)
plt.show()
def status_print_twoclass(optimal_result):
    """BayesSearchCV callback: log progress after each XGBoost two-class fit.

    `optimal_result` is supplied by skopt but unused; the running best is read
    off the tuner object itself.
    """
    # One cv_results_ row per candidate evaluated so far -> model counter.
    all_models = pd.DataFrame(bayes_cv_tuner_twoclass.cv_results_)
    # FIX: removed the unused `best_params` Series the original built and dropped.
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner_twoclass.best_score_, 4),
        bayes_cv_tuner_twoclass.best_params_))
# Bayesian hyper-parameter search over an XGBoost binary classifier,
# ROC-AUC scored with 9-fold stratified CV.
bayes_cv_tuner_twoclass = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        n_jobs=n_cpus_avaliable,
        objective='binary:logistic',
        eval_metric='auc',
        # NOTE(review): `silent` was removed in newer xgboost (use
        # `verbosity`), while `device` requires xgboost >= 2.0 - confirm the
        # installed version supports this exact parameter mix.
        silent=1,
        tree_method='approx',
        device='cpu',
    ),
    search_spaces={
        'booster': ['gbtree', 'dart'],
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'max_delta_step': (0, 20),
        'max_depth': (0, 25),  # 0 means "no depth limit" in xgboost
        'min_child_weight': (0, 10),
        'n_estimators': (100, 300),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
    },
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=9, shuffle=True),
    n_jobs=n_cpus_avaliable,
    n_iter=7,        # number of Bayesian optimisation candidates
    refit=True,      # refit the best estimator on the full training set
    verbose=0,
)
# Run the search; the callback prints progress after every candidate.
result_twoclass = bayes_cv_tuner_twoclass.fit(X_train, y_train,
                                              callback=status_print_twoclass)
xgb_twoclass_model = result_twoclass.best_estimator_
print(xgb_twoclass_model)
# Predicting the Test set results
y_pred = xgb_twoclass_model.predict(X_test)
y_pred_proba = xgb_twoclass_model.predict_proba(X_test)
ac = accuracy_score(y_test, y_pred)
print('The accuracy score of the XGBoost (Two Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(y_test, y_pred)
print(cr)
print('\n')
# Confusion matrix, precision-recall and ROC plots via scikit-plot.
skplt.metrics.plot_confusion_matrix(y_test, y_pred, title='Binomial Classification (XGBoost Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    );
print('\n')
skplt.metrics.plot_precision_recall(y_test, y_pred_proba,
                                    title='Binomial Classification (XGBoost Precision-Recall Curve)',
                                    );
print('\n')
skplt.metrics.plot_roc(y_test, y_pred_proba,
                       title='Binomial Classification (XGBoost ROC Curves)',
                       );
# Top-30 features by importance.
feature_importance = pd.DataFrame({'imp': xgb_twoclass_model.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Binomial Classification (XGBoost - Feature Importance(s))', fontsize=18)
plt.show()
# Semi-supervised variant: PseudoLabeler (a wrapper defined earlier in the
# notebook) augments training with model-labelled test rows (30% sample rate).
features = X_train.columns
target = 'attack_type_twoclass_num'  # column the PseudoLabeler writes pseudo-labels into
num_folds = 11
xgb_pseudolabeler_twoclass_model = PseudoLabeler(
    xgb_twoclass_model,  # base estimator: the tuned two-class XGBoost model
    X_test,              # unlabelled pool to pseudo-label
    features,
    target,
    sample_rate = 0.3
)
xgb_pseudolabeler_twoclass_model.fit(X_train, y_train)
#y_pred = xgb_pseudolabeler_twoclass_model.predict(X_test)
# 11-fold CV ROC-AUC of the pseudo-labelling pipeline on the training set.
scores = cross_val_score(xgb_pseudolabeler_twoclass_model, X_train, y_train, cv=num_folds, scoring='roc_auc', n_jobs=1)
scores
xgb_pseudolabeler_twoclass_model
# Predicting the Test set results.
y_pred = xgb_pseudolabeler_twoclass_model.predict(X_test)
y_pred_proba = xgb_pseudolabeler_twoclass_model.predict_proba(X_test)
ac = accuracy_score(y_test, y_pred)
print('The accuracy score of the XGBoost (Semi-Supervised Model) (Two Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(y_test, y_pred)
print(cr)
print('\n')
skplt.metrics.plot_confusion_matrix(y_test, y_pred, title='Binomial Classification (XGBoost (Semi-Supervised Model) Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    );
print('\n')
skplt.metrics.plot_precision_recall(y_test, y_pred_proba,
                                    title='Binomial Classification (XGBoost (Semi-Supervised Model) Precision-Recall Curve)',
                                    );
print('\n')
skplt.metrics.plot_roc(y_test, y_pred_proba,
                       title='Binomial Classification (XGBoost (Semi-Supervised Model) ROC Curves)',
                       );
# Importances come from the wrapped estimator (.model exposes the inner XGBoost).
feature_importance = pd.DataFrame({'imp': xgb_pseudolabeler_twoclass_model.model.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Binomial Classification (XGBoost (Semi-Supervised Model) - Feature Importance(s))', fontsize=18)
plt.show()
def lgbm_status_print_fiveclass(optimal_result):
    """BayesSearchCV callback: log progress after each LightGBM five-class fit.

    `optimal_result` is supplied by skopt but unused; the running best is read
    off the tuner object itself.
    """
    # One cv_results_ row per candidate evaluated so far -> model counter.
    all_models = pd.DataFrame(lgbm_bayes_cv_tuner_fiveclass.cv_results_)
    # FIX: removed the unused `best_params` Series, and corrected the label -
    # the five-class tuner sets no `scoring`, so best_score_ is the
    # estimator's default score (accuracy), not ROC-AUC as formerly claimed.
    print('Model #{}\nBest score: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(lgbm_bayes_cv_tuner_fiveclass.best_score_, 4),
        lgbm_bayes_cv_tuner_fiveclass.best_params_))
# Bayesian hyper-parameter search over a LightGBM MULTICLASS classifier
# (five attack categories), 9-fold stratified CV.
lgbm_bayes_cv_tuner_fiveclass = BayesSearchCV(
    estimator=lgbm.LGBMClassifier(
        n_jobs=n_cpus_avaliable,
        objective='multiclass',
        metric='multi_logloss',
        num_class=5,
        class_weight='balanced',  # re-weight classes by inverse frequency
        silent=True,
    ),
    search_spaces={
        'boosting_type': ['gbdt', 'dart', 'rf'],
        # NOTE(review): LightGBM requires num_leaves > 1; the lower bound of 1
        # can draw an invalid candidate - confirm against the installed version.
        'num_leaves': (1, 50),
        'max_depth': (1, 25),
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'n_estimators': (100, 300),
        'min_split_gain': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0.01, 1.0, 'uniform'),
        'min_child_samples': (1, 50),
        'subsample': (0.01, 1.0, 'uniform'),
        'subsample_freq': (1, 50),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-5, 1000, 'log-uniform'),
        'bagging_fraction': (0.01, 1.0, 'uniform'),
        'feature_fraction': (0.01, 1.0, 'uniform'),
    },
    # No `scoring` given: candidates are ranked by the estimator's default
    # score (mean accuracy), unlike the two-class searches above.
    cv=StratifiedKFold(n_splits=9, shuffle=True),
    n_jobs=n_cpus_avaliable,
    n_iter=5,        # number of Bayesian optimisation candidates
    refit=True,      # refit the best estimator on the full training set
    verbose=0,
)
# Fit on the five-class target z_train; callback prints per-candidate progress.
lgbm_result_fiveclass = lgbm_bayes_cv_tuner_fiveclass.fit(X_train, z_train,
                                                          callback=lgbm_status_print_fiveclass)
lgbm_fiveclass_model = lgbm_result_fiveclass.best_estimator_
print(lgbm_fiveclass_model)
# Predicting the Test set results
z_pred = lgbm_fiveclass_model.predict(X_test)
z_pred_proba = lgbm_fiveclass_model.predict_proba(X_test)
ac = accuracy_score(z_test, z_pred)
print('The accuracy score of the LightGBM (Five Class) model is: {}%'.format(ac
      * 100))
print('\n')
cr = classification_report(z_test, z_pred)
print(cr)
print('\n')
# Confusion matrix, precision-recall and ROC plots via scikit-plot.
skplt.metrics.plot_confusion_matrix(z_test, z_pred,
                                    title='Multinomial Classification (LightGBM Confusion Matrix)'
                                    , x_tick_rotation=90, cmap='Oranges'
                                    )
print('\n')
skplt.metrics.plot_precision_recall(z_test, z_pred_proba,
                                    title='Multinomial Classification (LightGBM Precision-Recall Curve)'
                                    )
print('\n')
skplt.metrics.plot_roc(z_test, z_pred_proba,
                       title='Multinomial Classification (LightGBM ROC Curves)'
                       )
# Top-30 features by importance.
feature_importance = \
    pd.DataFrame({'imp': lgbm_fiveclass_model.feature_importances_,
                  'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'],
                                                    ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor)
plt.title('Multinomial Classification (LightGBM - Feature Importance(s))'
          , fontsize=18)
plt.show()
def status_print(optimal_result):
    """BayesSearchCV callback: log progress after each XGBoost five-class fit.

    `optimal_result` is supplied by skopt but unused; the running best is read
    off the tuner object itself.
    """
    # One cv_results_ row per candidate evaluated so far -> model counter.
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)
    # FIX: removed the unused `best_params` Series, and corrected the label -
    # `bayes_cv_tuner` sets no `scoring`, so best_score_ is the estimator's
    # default score (accuracy), not ROC-AUC as formerly claimed.
    print('Model #{}\nBest score: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_))
# Bayesian hyper-parameter search over XGBoost for the FIVE-class target.
bayes_cv_tuner = BayesSearchCV(
    estimator=xgb.XGBClassifier(
        n_jobs=n_cpus_avaliable,
        # FIX: this tuner is fitted on the five-class target (z_train), so the
        # binary objective/metric were wrong - use the multiclass equivalents.
        objective='multi:softprob',
        eval_metric='mlogloss',
        # NOTE(review): `silent` was removed in newer xgboost (use
        # `verbosity`) - confirm the installed version.
        silent=1,
        tree_method='approx',
        # FIX: dropped `nthread` - it is the deprecated alias of n_jobs and
        # was redundantly set to the same value.
    ),
    search_spaces={
        # 'booster': ['gbtree', 'dart'],
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        # FIX: 'min_child_weight' appeared twice in this dict; the duplicate
        # key silently overwrote the first - listed once now.
        'min_child_weight': (0, 10),
        'max_depth': (0, 25),  # 0 means "no depth limit" in xgboost
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'n_estimators': (100, 300),
        # NOTE(review): scale_pos_weight only affects binary objectives; it is
        # inert for multiclass - kept to preserve the search-space shape.
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
    },
    # No `scoring` given: candidates are ranked by the estimator's default
    # score (mean accuracy) across the stratified folds.
    cv=StratifiedKFold(n_splits=9, shuffle=True),
    n_jobs=n_cpus_avaliable,
    n_iter=9,        # number of Bayesian optimisation candidates
    refit=True,      # refit the best estimator on the full training set
    verbose=0,
)
# Fit the five-class XGBoost search; callback prints per-candidate progress.
result = bayes_cv_tuner.fit(X_train, z_train, callback=status_print)
xgb_fiveclass_model = result.best_estimator_
print(xgb_fiveclass_model)
# Predicting the Test set results
z_pred = xgb_fiveclass_model.predict(X_test)
z_pred_proba = xgb_fiveclass_model.predict_proba(X_test)
ac = accuracy_score(z_test, z_pred)
print('The accuracy score of the XGBoost (Five Class) model is: {}%'.format(ac
      * 100))
print('\n')
cr = classification_report(z_test, z_pred)
print(cr)
print('\n')
# Confusion matrix, precision-recall and ROC plots via scikit-plot.
skplt.metrics.plot_confusion_matrix(z_test, z_pred,
                                    title='Multinomial Classification (XGBoost Confusion Matrix)'
                                    , x_tick_rotation=90, cmap='Oranges'
                                    )
print('\n')
skplt.metrics.plot_precision_recall(z_test, z_pred_proba,
                                    title='Multinomial Classification (XGBoost Precision-Recall Curve)'
                                    )
print('\n')
skplt.metrics.plot_roc(z_test, z_pred_proba,
                       title='Multinomial Classification (XGBoost ROC Curves)'
                       )
# Top-30 features by importance.
feature_importance = pd.DataFrame({'imp': xgb_fiveclass_model.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Multinomial Classification (XGBoost - Feature Importance(s))', fontsize=18)
plt.show()
# Semi-supervised variant for the five-class problem: PseudoLabeler augments
# training with model-labelled test rows (30% sample rate).
features = X_train.columns
target = 'attack_type_fiveclass_num'  # column the PseudoLabeler writes pseudo-labels into
num_folds = 11
xgb_pseudolabeler_fiveclass_model = PseudoLabeler(
    xgb_fiveclass_model,  # base estimator: the tuned five-class XGBoost model
    X_test,               # unlabelled pool to pseudo-label
    features,
    target,
    sample_rate = 0.3
)
xgb_pseudolabeler_fiveclass_model.fit(X_train, z_train,)
#z_pred = xgb_pseudolabeler_fiveclass_model.predict(X_test)
# 11-fold CV (default accuracy scoring) of the pseudo-labelling pipeline.
scores = cross_val_score(xgb_pseudolabeler_fiveclass_model, X_train, z_train, cv=num_folds, n_jobs=1)
scores
xgb_pseudolabeler_fiveclass_model
# Predicting the Test set results.
z_pred = xgb_pseudolabeler_fiveclass_model.predict(X_test)
z_pred_proba = xgb_pseudolabeler_fiveclass_model.predict_proba(X_test)
ac = accuracy_score(z_test, z_pred)
print('The accuracy score of the XGBoost (Semi-Supervised Model) (Five Class) model is: {}%'.format(ac * 100))
print('\n')
cr = classification_report(z_test, z_pred)
print(cr)
print('\n')
skplt.metrics.plot_confusion_matrix(z_test, z_pred, title='Multinomial Classification (XGBoost (Semi-Supervised Model) Confusion Matrix)',
                                    x_tick_rotation=90,
                                    cmap='Oranges',
                                    );
print('\n')
skplt.metrics.plot_precision_recall(z_test, z_pred_proba,
                                    title='Multinomial Classification (XGBoost (Semi-Supervised Model) Precision-Recall Curve)',
                                    );
print('\n')
skplt.metrics.plot_roc(z_test, z_pred_proba,
                       title='Multinomial Classification (XGBoost (Semi-Supervised Model) ROC Curves)',
                       );
# Importances come from the wrapped estimator (.model exposes the inner XGBoost).
feature_importance = pd.DataFrame({'imp': xgb_pseudolabeler_fiveclass_model.model.feature_importances_, 'col': X_train.columns})
feature_importance = feature_importance.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
feature_importance.plot(kind='barh', x='col', y='imp', color=belize_light_flavor);
plt.title('Multinomial Classification (XGBoost (Semi-Supervised Model) - Feature Importance(s))', fontsize=18)
plt.show()
# Total wall-clock runtime since `notebookstart` was recorded at the top of
# the notebook.
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))